{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+---------+-----------+-----------------+------------------+---------------------+----------------+\n",
      "| area|perimeter|compactness| length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|\n",
      "+-----+---------+-----------+-----------------+------------------+---------------------+----------------+\n",
      "|15.26|    14.84|      0.871|            5.763|             3.312|                2.221|            5.22|\n",
      "|14.88|    14.57|     0.8811|5.553999999999999|             3.333|                1.018|           4.956|\n",
      "|14.29|    14.09|      0.905|            5.291|3.3369999999999997|                2.699|           4.825|\n",
      "+-----+---------+-----------+-----------------+------------------+---------------------+----------------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "\n",
    "spark = SparkSession.builder.appName('cluster').getOrCreate()\n",
    "df = spark.read.csv('seeds_dataset.csv', inferSchema=True, header=True)\n",
    "df.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- area: double (nullable = true)\n",
      " |-- perimeter: double (nullable = true)\n",
      " |-- compactness: double (nullable = true)\n",
      " |-- length_of_kernel: double (nullable = true)\n",
      " |-- width_of_kernel: double (nullable = true)\n",
      " |-- asymmetry_coefficient: double (nullable = true)\n",
      " |-- length_of_groove: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+\n",
      "| area|perimeter|compactness| length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|            features|\n",
      "+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+\n",
      "|15.26|    14.84|      0.871|            5.763|             3.312|                2.221|            5.22|[15.26,14.84,0.87...|\n",
      "|14.88|    14.57|     0.8811|5.553999999999999|             3.333|                1.018|           4.956|[14.88,14.57,0.88...|\n",
      "|14.29|    14.09|      0.905|            5.291|3.3369999999999997|                2.699|           4.825|[14.29,14.09,0.90...|\n",
      "+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.ml.feature import VectorAssembler\n",
    "from pyspark.ml.clustering import KMeans\n",
    "\n",
    "assembler = VectorAssembler(inputCols = df.columns, outputCol = 'features')\n",
    "final_df = assembler.transform(df)\n",
    "final_df.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- area: double (nullable = true)\n",
      " |-- perimeter: double (nullable = true)\n",
      " |-- compactness: double (nullable = true)\n",
      " |-- length_of_kernel: double (nullable = true)\n",
      " |-- width_of_kernel: double (nullable = true)\n",
      " |-- asymmetry_coefficient: double (nullable = true)\n",
      " |-- length_of_groove: double (nullable = true)\n",
      " |-- features: vector (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "final_df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+\n",
      "| area|perimeter|compactness| length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|            features|      scaledFeatures|\n",
      "+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+\n",
      "|15.26|    14.84|      0.871|            5.763|             3.312|                2.221|            5.22|[15.26,14.84,0.87...|[5.24452795332028...|\n",
      "|14.88|    14.57|     0.8811|5.553999999999999|             3.333|                1.018|           4.956|[14.88,14.57,0.88...|[5.11393027165175...|\n",
      "|14.29|    14.09|      0.905|            5.291|3.3369999999999997|                2.699|           4.825|[14.29,14.09,0.90...|[4.91116018695588...|\n",
      "+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.ml.feature import StandardScaler\n",
    "\n",
    "scaler = StandardScaler(inputCol = 'features', outputCol = 'scaledFeatures')\n",
    "scaler_model = scaler.fit(final_df)\n",
    "final_df = scaler_model.transform(final_df)\n",
    "final_df.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_df.take(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "kmeans = KMeans(featuresCol = 'scaledFeatures', k=3)\n",
    "model = kmeans.fit(final_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WSSSE: 428.60820118716356\n"
     ]
    }
   ],
   "source": [
    "print('WSSSE:', model.computeCost(final_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,\n",
      "        3.15410901, 10.38031464]), array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,\n",
      "        2.41585013, 12.29286107]), array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,\n",
      "        1.80061978, 10.41913733])]\n"
     ]
    }
   ],
   "source": [
    "centers = model.clusterCenters()\n",
    "print(centers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+----------+\n",
      "|      scaledFeatures|prediction|\n",
      "+--------------------+----------+\n",
      "|[5.24452795332028...|         2|\n",
      "|[5.11393027165175...|         2|\n",
      "|[4.91116018695588...|         2|\n",
      "|[4.75650503761158...|         2|\n",
      "|[5.54696468981581...|         2|\n",
      "|[4.94209121682475...|         2|\n",
      "|[5.04863143081749...|         2|\n",
      "|[4.84929812721816...|         2|\n",
      "|[5.71536696354628...|         1|\n",
      "|[5.65006812271202...|         2|\n",
      "|[5.24452795332028...|         2|\n",
      "|[4.82180387844584...|         2|\n",
      "|[4.77368894309428...|         2|\n",
      "|[4.73588435103234...|         2|\n",
      "|[4.72213722664617...|         2|\n",
      "|[5.01426361985209...|         2|\n",
      "|[4.80805675405968...|         2|\n",
      "|[5.39230954047151...|         2|\n",
      "|[5.05206821191403...|         2|\n",
      "|[4.37158555479908...|         0|\n",
      "+--------------------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "model.transform(final_df).select('scaledFeatures', 'prediction').show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
